{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "af956bc0",
   "metadata": {},
   "outputs": [],
   "source": [
    "%run -i \"../util/file_utils.ipynb\"\n",
    "%run -i \"../util/lang_utils.ipynb\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d07e59ec",
   "metadata": {},
   "source": [
    "# Remove stopwords using NLTK"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "4f7ea17a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', \"you're\", \"you've\", \"you'll\", \"you'd\", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', \"she's\", 'her', 'hers', 'herself', 'it', \"it's\", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', \"that'll\", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', \"don't\", 'should', \"should've\", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', \"aren't\", 'couldn', \"couldn't\", 'didn', \"didn't\", 'doesn', \"doesn't\", 'hadn', \"hadn't\", 'hasn', \"hasn't\", 'haven', \"haven't\", 'isn', \"isn't\", 'ma', 'mightn', \"mightn't\", 'mustn', \"mustn't\", 'needn', \"needn't\", 'shan', \"shan't\", 'shouldn', \"shouldn't\", 'wasn', \"wasn't\", 'weren', \"weren't\", 'won', \"won't\", 'wouldn', \"wouldn't\"]\n"
     ]
    }
   ],
   "source": [
    "from nltk.corpus import stopwords\n",
    "# nltk.download('stopwords') Run the first time\n",
    "print(stopwords.words('english'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "b24163a1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "230\n"
     ]
    }
   ],
   "source": [
    "text = read_text_file(\"../data/sherlock_holmes_1.txt\")\n",
    "words = word_tokenize_nltk(text)\n",
    "print(len(words))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "967bfbd4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "118\n"
     ]
    }
   ],
   "source": [
    "words = [word for word in words if word.lower() not in stopwords.words(\"english\")]\n",
    "print(len(words))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "cc21fe23",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "179\n",
      "180\n"
     ]
    }
   ],
   "source": [
    "# Add more stopwords\n",
    "stop_words = stopwords.words(\"english\")\n",
    "print(len(stop_words))\n",
    "stop_words.append(\"here\")\n",
    "print(len(stop_words))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c802e386",
   "metadata": {},
   "source": [
    "# Remove stopwords using spaCy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "5a6eba63",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'then', '’s', 'full', 'out', '’m', 'again', 'among', 'four', 'may', 'five', 'that', 'many', 'much', 'been', 'two', 'on', 'i', 'really', 'moreover', 'anyhow', 'yet', 'else', 'who', 'everywhere', 'of', 'throughout', 'three', \"n't\", 'also', 'before', 'beside', 'us', 'only', \"'ve\", 'regarding', 'fifteen', 'go', 'could', 'being', 'after', 'see', 'seemed', 'thereafter', 'against', 'you', 'yourselves', 'third', 'former', 'whole', 'herself', 'either', \"'s\", 'whither', 'no', 'thereby', 'what', 'some', 'already', \"'re\", 'nobody', 'themselves', 'where', 'everyone', 'because', 'first', 'himself', '’ve', 'hundred', '‘re', 'rather', 'due', 'whoever', 'became', 'thus', 'mostly', 're', 'so', 'why', 'last', 'beforehand', 'perhaps', 'mine', 'neither', 'behind', 'n’t', 'until', 'might', 'he', 'say', 'myself', 'down', 'empty', 'formerly', 'whereas', 'bottom', 'thru', 'while', 'twelve', 'him', 'well', 'wherever', 'as', 'except', '‘ll', 'further', 'front', 'onto', 'doing', 'otherwise', 'has', 'her', 'one', 'twenty', 'almost', 'noone', 'put', 'not', 'through', 'would', 'this', 'must', 'via', 'yourself', 'below', 'amongst', \"'m\", 'keep', 'various', 'within', '’d', 'here', 'towards', 'whence', 'them', 'between', '’re', '‘ve', 'ours', 'thereupon', 'in', 'other', 'own', \"'d\", 'does', 'but', 'be', 'ten', 'anywhere', 'all', 'my', 'hereupon', 'whose', 'should', 'whereupon', 'though', 'these', 'latter', 'others', 'whenever', 'become', 'more', 'part', '’ll', 'very', 'afterwards', 'whereafter', 'seems', '‘m', 'using', 'those', 'hence', 'each', 'their', 'becomes', 'six', 'am', 'often', 'something', 'during', 'whom', 'hereby', 'side', 'whether', 'we', 'did', 'had', 'sometimes', 'thence', 'our', 'therein', 'therefore', 'upon', 'seeming', 'hers', 'its', 'get', 'and', 'few', 'into', 'by', 'they', 'under', 'now', 'whereby', 'somehow', 'since', 'than', 'from', 'once', 'eight', 'however', 'please', 'anyone', 'move', 'next', 'or', 'about', 'across', 'serious', 'n‘t', 'if', 'sometime', 'latterly', 'his', 'will', 'together', 'most', 'never', 'besides', 'give', 'any', 'make', 'nowhere', 'every', 'ourselves', 'yours', 'have', 'nevertheless', 'for', 'such', 'up', 'even', 'per', '‘d', 'show', 'without', 'meanwhile', 'someone', 'around', 'done', 'hereafter', 'amount', 'were', 'is', 'unless', 'both', 'there', 'namely', 'the', 'nine', 'fifty', 'sixty', 'anything', 'nothing', 'eleven', 'ever', 'enough', 'just', 'name', 'another', 'are', 'your', 'call', 'back', 'itself', 'seem', 'to', 'it', 'nor', 'everything', 'toward', 'less', 'with', 'she', 'somewhere', \"'ll\", 'which', 'still', 'at', 'alone', 'over', 'above', 'top', 'beyond', 'although', 'do', 'was', 'a', 'anyway', 'how', 'forty', 'used', 'whatever', 'none', 'least', 'along', 'same', 'can', 'several', 'wherein', 'an', 'off', 'indeed', 'quite', 'cannot', 'herein', 'ca', 'becoming', 'always', 'made', 'me', 'elsewhere', 'when', 'take', 'too', '‘s'}\n",
      "<class 'set'>\n",
      "230\n"
     ]
    }
   ],
   "source": [
    "stopwords = small_model.Defaults.stop_words\n",
    "print(stopwords)\n",
    "print(type(stopwords))\n",
    "words = word_tokenize_nltk(text)\n",
    "print(len(words))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "b6cc9720",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "106\n"
     ]
    }
   ],
   "source": [
    "words = [word for word in words if word.lower() not in stopwords]\n",
    "print(len(words))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "cfcd23e4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "326\n",
      "327\n"
     ]
    }
   ],
   "source": [
    "# Add stopwords\n",
    "print(len(stopwords))\n",
    "stopwords.add(\"new\")\n",
    "print(len(stopwords))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "af5d7f84",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "327\n",
      "326\n"
     ]
    }
   ],
   "source": [
    "# Remove stopwords\n",
    "print(len(stopwords))\n",
    "stopwords.remove(\"new\")\n",
    "print(len(stopwords))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e2ee89d7",
   "metadata": {},
   "source": [
    "# Remove stopwords using frequency"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "5ea831cd",
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.probability import FreqDist"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "b8ae4955",
   "metadata": {},
   "outputs": [],
   "source": [
    "def compile_stopwords_list_frequency(text, cut_off=0.02):\n",
    "    words = word_tokenize_nltk(text)\n",
    "    freq_dist = FreqDist(word.lower() for word in words)\n",
    "    words_with_frequencies = [(word, freq_dist[word]) for word in freq_dist.keys()]\n",
    "    sorted_words = sorted(words_with_frequencies, key=lambda tup: tup[1])\n",
    "    stopwords = []\n",
    "    if (type(cut_off) is int):\n",
    "        stopwords = [tuple[0] for tuple in sorted_words if tuple[1] > cut_off] # First option: use a frequency cutoff\n",
    "    elif (type(cut_off) is float):\n",
    "        length_cutoff = int(cut_off*len(sorted_words)) # Second option: use a percentage of the words\n",
    "        stopwords = [tuple[0] for tuple in sorted_words[-length_cutoff:]]\n",
    "    else:\n",
    "        raise TypeError(\"The cut off needs to be either a float (percentage) or an int (frequency cut off)\")\n",
    "    return stopwords"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "74841a1e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['make', 'myself', 'night', 'until', 'street', 'few', 'why', 'thought', 'take', 'friend', 'lady', 'side', 'small', 'still', 'these', 'find', 'st.', 'every', 'watson', 'too', 'round', 'young', 'father', 'left', 'day', 'yet', 'first', 'once', 'took', 'its', 'eyes', 'long', 'miss', 'through', 'asked', 'most', 'saw', 'oh', 'morning', 'right', 'last', 'like', 'say', 'tell', 't', 'sherlock', 'their', 'go', 'own', 'after', 'away', 'never', 'good', 'nothing', 'case', 'however', 'quite', 'found', 'made', 'house', 'such', 'heard', 'way', 'yes', 'hand', 'much', 'matter', 'where', 'might', 'just', 'room', 'any', 'face', 'here', 'back', 'door', 'how', 'them', 'two', 'other', 'came', 'time', 'did', 'than', 'come', 'before', 'must', 'only', 'know', 'about', 'shall', 'think', 'more', 'over', 'us', 'well', 'am', 'or', 'may', 'they', ';', 'our', 'should', 'now', 'see', 'down', 'can', 'some', 'if', 'will', 'mr.', 'little', 'who', 'into', 'do', 'has', 'could', 'up', 'man', 'out', 'when', 'would', 'an', 'are', 'by', '!', 'were', 's', 'then', 'one', 'all', 'on', 'no', 'what', 'been', 'your', 'very', 'him', 'her', 'she', 'so', '‘', 'holmes', 'upon', 'this', 'said', 'from', 'there', 'we', 'me', 'be', 'but', 'not', 'for', '?', 'at', 'which', 'with', 'had', 'as', 'have', 'my', '’', 'is', 'his', 'was', 'you', 'he', 'it', 'that', 'in', '”', 'a', 'of', 'to', '“', 'and', 'i', '.', 'the', ',']\n",
      "181\n"
     ]
    }
   ],
   "source": [
    "text = read_text_file(\"../data/sherlock_holmes.txt\")\n",
    "stopwords = compile_stopwords_list_frequency(text)\n",
    "print(stopwords)\n",
    "print(len(stopwords))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "0d8826e8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['silence', 'married', 'late', 'week', 'led', 'girl', 'imagine', 'followed', 'dress', 'present', 'king', 'beside', 'windows', 'people', 'began', 'cab', 'held', 'square', 'goose', 'marriage', 'rooms', 'feet', 'appeared', 'none', 'called', 'white', 'john', 'drove', 'thank', 'reason', 'coronet', 'spoke', 'things', 'crime', 'family', 'manner', 'true', 'visitor', 'lost', 'taken', 'letters', 'absolutely', 'floor', 'fear', 'air', 'hard', 'coming', 'remember', 'above', 'husband', 'keep', 'used', 'clair', 'seven', 'want', 'dr.', 'walked', 'seems', 'cry', 'ten', 'red', 'letter', 'ran', 'obvious', 'stone', 'cut', 'threw', 'heavy', 'death', 'else', 'god', 'inspector', 'colonel', 'new', 'country', 'don', 'note', 'client', 'four', 'road', 'turn', 'instant', 'waiting', 'several', 'going', 'sound', 'best', 'able', 'became', 'near', 'evening', 'hope', 'forward', 'felt', 'each', 'story', 'part', 'corner', 'given', 'hear', 'laid', 'use', 'fellow', 'ah', 'strong', 'singular', 'companion', 'either', 'o', 'hour', 'entered', 'rushed', 'sister', 'dark', 'together', 'believe', 'facts', 'question', 'interest', 'll', 'five', 'among', 'stood', 'opened', 'least', 'ask', 'mccarthy', 'returned', 'london', 'read', 'moment', '£', 'save', 'across', 'rucastle', 'word', 'clock', 'doctor', 'less', 'during', 'does', 'bed', 'lestrade', 'simon', 'men', 'half', 'since', 'you.', 'paper', 'it.', 'whom', 'days', 'soon', 'mrs.', 'met', 'yourself', 'certainly', 'point', 'minutes', 'lord', 'police', 'passed', 'work', 'many', 'large', 'money', 'hat', 'papers', 'set', 'under', 'baker', 'clear', 'call', 'pray', 'another', 'sure', 'looking', 'lay', 'gone', 'fire', 'table', 'whether', 'wish', 'help', 'life', 'hair', 'whole', 'home', 'gentleman', 'thing', 'being', 'possible', 'words', 'both', 'strange', 'gave', 'suddenly', 'also', 'already', 'leave', 'within', ':', 'understand', 'behind', 'black', 'anything', 'told', 'hardly', 'better', 'front', 'brought', 'son', 'looked', 'against', 'really', 'turned', 'place', 'end', 'got', 'enough', 'far', 'sat', 'chair', 'same', 'answered', 'dear', 'wife', 'always', 'knew', 'mind', 'between', 'course', 'give', 'great', 'name', 'indeed', 'open', 'perhaps', 'though', 'get', 'doubt', 'rather', 'again', 'remarked', 'years', 'something', 'hands', 'while', 'those', 'done', 'woman', 'head', 'business', 'old', 'window', 'cried', 'went', 'seemed', 'having', 'put', 'three', 'seen', 'ever', 'even', 'let', 'sir', 'look', 'off', 'himself', 'light', 'without', 'make', 'myself', 'night', 'until', 'street', 'few', 'why', 'thought', 'take', 'friend', 'lady', 'side', 'small', 'still', 'these', 'find', 'st.', 'every', 'watson', 'too', 'round', 'young', 'father', 'left', 'day', 'yet', 'first', 'once', 'took', 'its', 'eyes', 'long', 'miss', 'through', 'asked', 'most', 'saw', 'oh', 'morning', 'right', 'last', 'like', 'say', 'tell', 't', 'sherlock', 'their', 'go', 'own', 'after', 'away', 'never', 'good', 'nothing', 'case', 'however', 'quite', 'found', 'made', 'house', 'such', 'heard', 'way', 'yes', 'hand', 'much', 'matter', 'where', 'might', 'just', 'room', 'any', 'face', 'here', 'back', 'door', 'how', 'them', 'two', 'other', 'came', 'time', 'did', 'than', 'come', 'before', 'must', 'only', 'know', 'about', 'shall', 'think', 'more', 'over', 'us', 'well', 'am', 'or', 'may', 'they', ';', 'our', 'should', 'now', 'see', 'down', 'can', 'some', 'if', 'will', 'mr.', 'little', 'who', 'into', 'do', 'has', 'could', 'up', 'man', 'out', 'when', 'would', 'an', 'are', 'by', '!', 'were', 's', 'then', 'one', 'all', 'on', 'no', 'what', 'been', 'your', 'very', 'him', 'her', 'she', 'so', '‘', 'holmes', 'upon', 'this', 'said', 'from', 'there', 'we', 'me', 'be', 'but', 'not', 'for', '?', 'at', 'which', 'with', 'had', 'as', 'have', 'my', '’', 'is', 'his', 'was', 'you', 'he', 'it', 'that', 'in', '”', 'a', 'of', 'to', '“', 'and', 'i', '.', 'the', ',']\n",
      "452\n"
     ]
    }
   ],
   "source": [
    "text = read_text_file(\"../data/sherlock_holmes.txt\")\n",
    "stopwords = compile_stopwords_list_frequency(text, cut_off=0.05)\n",
    "print(stopwords)\n",
    "print(len(stopwords))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "6ccd9a01",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['away', 'never', 'good', 'nothing', 'case', 'however', 'quite', 'found', 'made', 'house', 'such', 'heard', 'way', 'yes', 'hand', 'much', 'matter', 'where', 'might', 'just', 'room', 'any', 'face', 'here', 'back', 'door', 'how', 'them', 'two', 'other', 'came', 'time', 'did', 'than', 'come', 'before', 'must', 'only', 'know', 'about', 'shall', 'think', 'more', 'over', 'us', 'well', 'am', 'or', 'may', 'they', ';', 'our', 'should', 'now', 'see', 'down', 'can', 'some', 'if', 'will', 'mr.', 'little', 'who', 'into', 'do', 'has', 'could', 'up', 'man', 'out', 'when', 'would', 'an', 'are', 'by', '!', 'were', 's', 'then', 'one', 'all', 'on', 'no', 'what', 'been', 'your', 'very', 'him', 'her', 'she', 'so', '‘', 'holmes', 'upon', 'this', 'said', 'from', 'there', 'we', 'me', 'be', 'but', 'not', 'for', '?', 'at', 'which', 'with', 'had', 'as', 'have', 'my', '’', 'is', 'his', 'was', 'you', 'he', 'it', 'that', 'in', '”', 'a', 'of', 'to', '“', 'and', 'i', '.', 'the', ',']\n",
      "131\n"
     ]
    }
   ],
   "source": [
    "stopwords = compile_stopwords_list_frequency(text, cut_off=100)\n",
    "print(stopwords)\n",
    "print(len(stopwords))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d1faa6ff",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
